# =========================================================
# Appendix F / Table 5 — Topic Modeling tables (A25A/A25B)
# =========================================================

suppressPackageStartupMessages({
  library(readxl)
  library(quanteda)
  library(topicmodels)
  library(dplyr)
  library(stopwords)
  library(stringr)
  library(tidyr)
  library(purrr)
  library(knitr)
  library(kableExtra)
})

# -------------------------
# Paths
# -------------------------
# Project root; every output path below is built from it.
base_dir <- "~/Dropbox/Issue voting Chile"
# Folder that receives the CSV files written by print_and_save().
out_dir  <- file.path(base_dir, "09_replication/output")
# Create the folder (and any missing parents); showWarnings = FALSE keeps the
# call quiet when the folder already exists.
dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)

# Raw first-wave survey workbook (Excel).
# NOTE(review): this path sits under a different Dropbox root than base_dir;
# confirm it resolves on the machine running the replication.
survey_xlsx <- file.path(
  "~/Library/CloudStorage/Dropbox/Survey 2021 Chile/01_data/raw data/first wave",
  "PURDUS_227657_20211122.xlsx"
)

# -------------------------
# Load & group ideology
# -------------------------
# Read the survey sheet and, in the same pipeline, collapse the A12 score into
# two ideology groups: 1-4 -> "Left", 7-10 -> "Right". Every other value
# (5-6, out-of-range codes, missing A12) is set to NA.
survey <- read_excel(survey_xlsx, sheet = "PURDUS_227657") %>%
  mutate(
    ideology_group = case_when(
      # A missing A12 makes both comparisons NA, which case_when() treats as
      # "no match", so the row falls through to the NA default below.
      A12 >= 1 & A12 <= 4  ~ "Left",
      A12 >= 7 & A12 <= 10 ~ "Right",
      TRUE ~ NA_character_
    )
  )

# =========================================================
# Helpers
# =========================================================

# Turn one text column of a data frame into a quanteda document-feature matrix.
#
# Args:
#   data:     Data frame containing the text column.
#   text_var: Passed to corpus() as `text_field` to identify the text column.
#
# Returns: a dfm limited to features with a total frequency of at least 5;
#   documents left without any token after trimming are dropped.
make_dfm <- function(data, text_var) {
  # Extra tokens removed on top of the Spanish stopword list
  filler_words <- c("na","n/a","ninguno","ninguna","nose","no","si","sí")

  # Tokenize (dropping punctuation and numbers), lowercase, then filter
  response_tokens <- corpus(data, text_field = text_var) %>%
    tokens(remove_punct = TRUE, remove_numbers = TRUE) %>%
    tokens_tolower() %>%
    tokens_remove(pattern = stopwords("es")) %>%
    tokens_remove(pattern = filler_words)

  trimmed <- dfm_trim(dfm(response_tokens), min_termfreq = 5)

  # Trimming rare features can leave documents empty; keep only the rest
  dfm_subset(trimmed, ntoken(trimmed) > 0)
}

# Fit a k-topic LDA model (method = "Gibbs", 3000 iterations) to a dfm and
# summarize every topic by its mean document-topic proportion and its
# highest-ranked terms.
#
# Args:
#   dfm_obj: quanteda dfm (e.g. the output of make_dfm()), or NULL.
#   k:       Number of topics.
#   seed:    Seed applied via set.seed() and passed to the LDA control list.
#   top_n:   Number of top terms listed per topic.
#
# Returns: a tibble with one row per topic and the columns `Topic`,
#   `Mean Contribution` (rounded to 3 decimals) and
#   `Example Words (Among Top 15)`; NULL when `dfm_obj` is NULL, has fewer
#   documents than `k`, or has no features left to model.
#
# Note: the last column keeps its fixed "Top 15" label whatever `top_n` is,
#   because print_and_save() and print_manuscript_table() select it by name.
lda_table <- function(dfm_obj, k, seed = 2024, top_n = 15) {
  # is.null() comes first so the quanteda accessors are never called on NULL
  if (is.null(dfm_obj) || ndoc(dfm_obj) < k || nfeat(dfm_obj) == 0) {
    return(NULL)
  }

  dtm <- convert(dfm_obj, to = "topicmodels")
  set.seed(seed)
  m <- LDA(dtm, k = k, method = "Gibbs", control = list(seed = seed, iter = 3000))

  # Topic prevalence (mean document-topic proportions)
  gamma <- posterior(m)$topics  # docs x k
  mean_contrib <- colMeans(gamma)

  # Top-N terms per topic. get_terms() returns a plain vector (one term per
  # topic) rather than a matrix when only one term is requested, so restore
  # the 1 x k matrix shape before collapsing column-wise.
  top_terms <- get_terms(m, top_n)
  if (is.null(dim(top_terms))) {
    top_terms <- matrix(top_terms, nrow = 1L)
  }
  examples <- apply(top_terms, 2, paste, collapse = ", ")

  tibble(
    Topic = paste0("Topic ", seq_len(k)),
    `Mean Contribution` = round(as.numeric(mean_contrib), 3),
    `Example Words (Among Top 15)` = unname(examples)
  )
}

# Build one table panel: fit the topic model for the full sample and for the
# "Left" and "Right" ideology groups, then stack the per-group tables.
#
# Args:
#   data:        Data frame with an `ideology_group` column and the text column.
#   text_var:    Name of the text column to model.
#   k:           Number of topics.
#   panel_label: Value written to the leading `Panel` column.
#
# Returns: a tibble with columns Panel, Group, `Topic (k=)` followed by the
#   lda_table() columns. Groups with no rows, only-missing text, or no fitted
#   model contribute no rows.
run_panel <- function(data, text_var, k, panel_label) {
  subsets <- list(
    "Full Sample" = data,
    "Left"  = dplyr::filter(data, ideology_group == "Left"),
    "Right" = dplyr::filter(data, ideology_group == "Right")
  )
  k_label <- paste0("k=", k)

  # Slots of skipped groups stay NULL; bind_rows() ignores them
  pieces <- vector("list", length(subsets))
  for (i in seq_along(subsets)) {
    subset_df <- subsets[[i]]

    has_text <- nrow(subset_df) > 0 && !all(is.na(subset_df[[text_var]]))
    if (!has_text) next

    topic_tab <- lda_table(make_dfm(subset_df, text_var), k = k)
    if (is.null(topic_tab)) next

    # Both label columns go in front, in the order Group, Topic (k=)
    pieces[[i]] <- mutate(
      topic_tab,
      Group = names(subsets)[i],
      `Topic (k=)` = k_label,
      .before = 1
    )
  }

  mutate(bind_rows(pieces), Panel = panel_label, .before = 1)
}

# Echo a results table to the console and write it to
# <out_dir>/<file_stub>.csv.
#
# Args:
#   tbl:       Table with the columns Panel, Group, `Topic (k=)`, Topic,
#              `Mean Contribution` and `Example Words (Among Top 15)`.
#   file_stub: File name without extension; also printed as the banner title.
#
# Returns (invisibly): the six-column table that was written, or NULL when
#   `tbl` is NULL or has no rows (nothing is printed or written then).
print_and_save <- function(tbl, file_stub) {
  if (is.null(tbl) || nrow(tbl) == 0) {
    return(invisible(NULL))
  }

  export_cols <- c(
    "Panel", "Group", "Topic (k=)", "Topic",
    "Mean Contribution", "Example Words (Among Top 15)"
  )
  tbl_out <- select(tbl, all_of(export_cols))

  # Banner followed by the table
  cat(
    "\n=================================================\n",
    paste0(file_stub, "\n"),
    "=================================================\n",
    sep = ""
  )
  # One alignment letter per column (six); only `Topic` is centered
  print(kable(tbl_out, align = "lllcll"))

  csv_path <- file.path(out_dir, paste0(file_stub, ".csv"))
  write.csv(tbl_out, csv_path, row.names = FALSE)

  invisible(tbl_out)
}

# Pretty print with A/B panel headers (like manuscript Table 5).
#
# Args:
#   tab_A, tab_B: Panel tables holding the columns Group, Topic,
#                 `Mean Contribution` and `Example Words (Among Top 15)`.
#                 NULL or zero-row tables are treated as empty panels.
#   title_line:   Caption printed above the table.
#
# Returns: the value of print() on the table; invisible(NULL), with a
#   warning, when both panels are empty.
print_manuscript_table <- function(tab_A, tab_B, title_line) {
  shown_cols <- c(
    "Group", "Topic", "Mean Contribution", "Example Words (Among Top 15)"
  )

  # Same emptiness rule as print_and_save(). An empty panel becomes NULL so it
  # is neither passed to select() (a zero-row table may lack these columns)
  # nor given a row group whose end row would precede its start row.
  keep_panel <- function(tab) {
    if (is.null(tab) || nrow(tab) == 0) {
      return(NULL)
    }
    select(tab, all_of(shown_cols))
  }
  A <- keep_panel(tab_A)
  B <- keep_panel(tab_B)

  if (is.null(A) && is.null(B)) {
    warning(
      "Both panels are empty; nothing printed for: ", title_line,
      call. = FALSE
    )
    return(invisible(NULL))
  }

  cat("\n", title_line, "\n\n")

  n_A <- if (is.null(A)) 0L else nrow(A)
  n_B <- if (is.null(B)) 0L else nrow(B)

  out <- kable(bind_rows(A, B), booktabs = TRUE, align = "llcl")
  if (n_A > 0) {
    out <- pack_rows(out, "Panel A: Ideas About 'the Left'", 1, n_A)
  }
  if (n_B > 0) {
    out <- pack_rows(out, "Panel B: Ideas About 'the Right'", n_A + 1, n_A + n_B)
  }
  print(out)
}

# =========================================================
# Build Tables
# =========================================================

# --------- k = 2 (Main text: Table 5) ----------
# Every section below follows the same steps: fit one panel per open-ended
# item (A25A, A25B) with run_panel(), print both panels as a manuscript-style
# table, then stack them and pass the result to print_and_save(), which echoes
# the table and writes <file_stub>.csv into out_dir.
#
# The panel_label given to run_panel() ends up in the `Panel` column of the
# saved CSV; print_manuscript_table() prints its own fixed panel headers, so
# the label does not affect that printout.
tab5_A_k2 <- run_panel(survey, "A25A", k = 2, panel_label = "Panel A: Ideas About 'the Left'")
tab5_B_k2 <- run_panel(survey, "A25B", k = 2, panel_label = "Panel B: Ideas About 'the Right'")

print_manuscript_table(
  tab5_A_k2, tab5_B_k2,
  "Table 5: Topic Prevalence and Representative Words for Open-Ended Questions on Ideological Labels (k=2)"
)

tab5_k2_all <- bind_rows(tab5_A_k2, tab5_B_k2)
print_and_save(tab5_k2_all, "Table5_k2_topic_prevalence_and_example_words")

# --------- k = 3 (Appendix F: Table F1) ----------
# Same steps with three topics. Unlike the k = 2 labels, these panel labels
# also carry the item code (A25A / A25B).
tabF1_A_k3 <- run_panel(survey, "A25A", k = 3, panel_label = "Panel A: A25A — Ideas About 'the Left'")
tabF1_B_k3 <- run_panel(survey, "A25B", k = 3, panel_label = "Panel B: A25B — Ideas About 'the Right'")

print_manuscript_table(
  tabF1_A_k3, tabF1_B_k3,
  "Appendix F — Table F1: Topic Prevalence and Representative Words (k=3)"
)

tabF1_k3_all <- bind_rows(tabF1_A_k3, tabF1_B_k3)
print_and_save(tabF1_k3_all, "AppendixF_TableF1_k3_topic_prevalence_and_example_words")

# --------- k = 4 (Appendix F: Table F2) ----------
# Same steps with four topics.
tabF2_A_k4 <- run_panel(survey, "A25A", k = 4, panel_label = "Panel A: A25A — Ideas About 'the Left'")
tabF2_B_k4 <- run_panel(survey, "A25B", k = 4, panel_label = "Panel B: A25B — Ideas About 'the Right'")

print_manuscript_table(
  tabF2_A_k4, tabF2_B_k4,
  "Appendix F — Table F2: Topic Prevalence and Representative Words (k=4)"
)

tabF2_k4_all <- bind_rows(tabF2_A_k4, tabF2_B_k4)
print_and_save(tabF2_k4_all, "AppendixF_TableF2_k4_topic_prevalence_and_example_words")

# =========================================================
# (Optional) Semantic relabeling sketch (after inspecting example words):
#
# rename_topics <- function(tbl) {
#   tbl %>%
#     mutate(Topic = case_when(
#       str_detect(`Example Words (Among Top 15)`, "comunism|socialism|capitalism|derecha|izquierda|partid|politi") ~ "Ideological Labels",
#       str_detect(`Example Words (Among Top 15)`, "corrupt|ladr|rico|pobre|caos|violenc|elite|poder")               ~ "Value Judgement",
#       TRUE ~ Topic
#     ))
# }
# tab5_k2_all <- rename_topics(tab5_k2_all)
# print_and_save(tab5_k2_all, "Table5_k2_topic_prevalence_and_example_words_semantic")
# =========================================================
